The data was collected with Wikicrawl.
In [ ]:
%pylab inline
plt.rc('figure', figsize=(8, 5))
import os
# from collections import Counter
from fnmatch import fnmatch
import yaml
import networkx as nx
In [3]:
def slugify(value):
    """
    Normalize a string into a slug: ASCII-fold accents, drop non-word
    characters, collapse whitespace/hyphen runs into single underscores,
    and return the result with only the first letter upper-cased.

    Works on both byte strings (decoded as UTF-8) and text strings,
    under Python 2 and Python 3.
    """
    import re
    import unicodedata
    # In Py2 `bytes is str`, so this matches the original `type(value) is str`
    # check; in Py3 it decodes genuine byte strings.
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    # ASCII-fold: decompose accents (NFKD) then strip anything non-ASCII.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    value = re.sub(r'[-\s]+', '_', value)
    return value.capitalize()
In [4]:
# Corpus layout: <parent-of-cwd>/data/wikipedia/en holds the crawled pages.
corpus_path = os.path.join(os.path.dirname(os.getcwd()), 'data')
wk_path = os.path.join(corpus_path, 'wikipedia')
wk_en_path = os.path.join(wk_path, 'en')
# Single-argument print() produces identical output under Python 2 and 3.
print(wk_en_path)
In [57]:
# Build the Wikipedia category graph from the crawled directory tree plus the
# per-category YAML metadata files.
categories = nx.Graph()
for path, subdirs, files in os.walk(wk_en_path):
    here = os.path.split(path)[1]
    parent = os.path.split(os.path.split(path)[0])[1]
    categories.add_edge(parent, here)
    # FIXME(review): categories[node] is the *adjacency* dict, so these lines
    # insert a phantom "path" neighbour entry rather than a node attribute.
    # The lookups at the end of the notebook read back the same idiom, so it is
    # left unchanged here; the proper fix (at both ends) is
    # categories.node[node]["path"] = path.
    categories[parent]["path"] = path
    categories[here]["path"] = path
    for name in files:
        if fnmatch(name, "*.yaml"):  # only the per-category metadata files
            category_name = name[0:-5]  # strip the ".yaml" extension
            yaml_file_path = os.path.join(path, category_name + ".yaml")
            # Close the handle deterministically (the original leaked it).
            with open(yaml_file_path, "r") as yaml_file:
                # NOTE(review): yaml.load_all without an explicit Loader can run
                # arbitrary Python on tagged input; prefer yaml.safe_load_all
                # for crawled data.
                docs = yaml.load_all(yaml_file)
                # load_all is lazy, so the documents must be consumed while the
                # file is still open.
                for doc in docs:
                    cat_parent = doc["CategoryPath"][0]
                    categories.add_edge(slugify(cat_parent), slugify(category_name))
                    categories[slugify(cat_parent)]["path"] = path
                    categories[slugify(category_name)]["path"] = path
                    for cat in doc["Categories"][0]["en"]:
                        categories.add_edge(slugify(category_name), slugify(cat))
                        categories[slugify(cat)]["path"] = path
print("The categories graph %s has %d nodes with %d edges"
      % (categories.name, nx.number_of_nodes(categories), nx.number_of_edges(categories)))
In [8]:
nx.draw_networkx(categories, node_size=15, edge_color='y', with_labels=False, alpha=.4, linewidths=0)
In [20]:
def nx_to_gv_file(_nx_graph, _name, _dir_path):
    '''
    Dump the graph's edges as a Graphviz .gv digraph and render it to PNG
    with sfdp.

    _nx_graph -- graph exposing .edges() as (source, target) string pairs
    _name     -- basename (without extension) for the .gv / .png outputs
    _dir_path -- directory where both files are written
    '''
    gv_filepath = os.path.join(_dir_path, _name + ".gv")
    viz_filepath = os.path.join(_dir_path, _name + ".png")
    with open(gv_filepath, 'w') as f:
        f.write("digraph mentions {\n")  # open the digraph
        for edge in _nx_graph.edges():
            f.write('"' + edge[0] + '"' + "->" + '"' + edge[1] + '"' + "\n")
        f.write("}" + "\n")  # close the digraph
    print(" graphiz file saved as %s" % gv_filepath)
    # NOTE(review): the shell command is built by string concatenation, so paths
    # containing spaces or shell metacharacters would break (or inject). Fine
    # for local notebook use; prefer subprocess.call with an argument list.
    command = "sfdp -Gbgcolor=black -Ncolor=white -Ecolor=white -Nwidth=0.05 -Nheight=0.05 -Nfixedsize=true -Nlabel='' -Earrowsize=0.4 -Gsize=75 -Gratio=fill -Tpng " + gv_filepath + " > " + viz_filepath
    os.system(command)
    print("viz graph saved as %s" % viz_filepath)
nx_to_gv_file(categories, "wiki_cat", os.getcwd())
In [19]:
import csv
def list_to_csv(_keys,_rows,_csv_filepath):
with open(_csv_filepath,'w') as f: # writes the final output to CSV
csv_out=csv.writer(f)
csv_out.writerow(_keys) # add header
for row in _rows:
csv_out.writerow(row)
print " csv has been stored as %s"%_csv_filepath
def nx_to_gephi_csv(_nx_graph, _name, _dir_path):
    '''
    Export a graph as Gephi-importable CSVs: one node list, one edge list.
    '''
    # BUG FIX: nodes() yields bare name strings; passing a string as a "row"
    # makes csv.writer emit one character per column, which does not match the
    # two-column ["Id", "Label"] header. Emit explicit (Id, Label) pairs.
    node_rows = [(n, n) for n in _nx_graph.nodes()]
    list_to_csv(["Id", "Label"], node_rows, os.path.join(_dir_path, _name + '_nodes.csv'))
    list_to_csv(["Source", "Target"], _nx_graph.edges(), os.path.join(_dir_path, _name + '_edges.csv'))
    print("graph files (nodes+edges) saved at %s" % _dir_path)
nx_to_gephi_csv(categories, "wiki_cat", os.getcwd())
In [77]:
from random import choice

# Pick two category nodes that are neither identical nor direct neighbours,
# so their documents come from unrelated regions of the category graph.
# (The duplicate `import os` from the original cell was removed: os is already
# imported at the top of the notebook.)
first_node = choice(categories.nodes())  # pick a random node
possible_nodes = set(categories.nodes())
neighbours = categories.neighbors(first_node) + [first_node]
possible_nodes.difference_update(neighbours)  # exclude first node + its neighbours
second_node = choice(list(possible_nodes))  # pick second node
# Same output as the Py2 `print a, b`, but valid in both Python 2 and 3.
print("%s %s" % (first_node, second_node))
In [78]:
def _print_txt_files(dir_path):
    # List the plain-text article files stored under one category directory.
    # (Loop variable renamed: the original `file` shadowed the builtin.)
    for fname in os.listdir(dir_path):
        if fname.endswith(".txt"):
            print(os.path.join(dir_path, fname))

# Report both chosen categories in order (de-duplicates the copy-pasted loop).
for node in (first_node, second_node):
    _print_txt_files(categories[node]["path"])